# Fetch every utterance in the "Eng-NA" collection.
utterances <- get_utterances(collection = "Eng-NA")

# Keep only speech from the target child and their parents, lower-case the
# gloss text, and order rows by transcript and then by position within it.
cleanutts <- utterances %>%
  filter(speaker_role %in% c("Target_Child", "Mother", "Father")) %>%
  mutate(gloss = str_to_lower(gloss)) %>%
  arrange(transcript_id, utterance_order)

#' Extract the non-empty utterances of one or more transcripts
#'
#' Reads from the global `cleanutts` table.
#'
#' @param these_transcript_ids Vector of transcript ids to keep.
#' @return The rows of `cleanutts` whose `transcript_id` is in
#'   `these_transcript_ids` and whose `gloss` is not the empty string,
#'   reduced to the columns gloss, transcript_id, utterance_order,
#'   speaker_code, target_child_age and target_child_id.
get_convo <- function(these_transcript_ids) {
  requested <- filter(cleanutts, transcript_id %in% these_transcript_ids)
  spoken <- filter(requested, gloss != "")
  select(
    spoken,
    gloss, transcript_id, utterance_order, speaker_code,
    target_child_age, target_child_id
  )
}

# Conversation table for transcript 2798; the Python code reads it as r.convos.
convos <- get_convo(these_transcript_ids = 2798)
import io
import os
import gensim
from gensim import utils
import gensim.models
import gensim.models.word2vec
from gensim.test.utils import datapath
import numpy as np
import sklearn
import matplotlib
from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE  

# Load the previously saved word2vec model that get_vectors looks words up in.
model = gensim.models.word2vec.Word2Vec.load("models/childes_adult_word2vec.model")

def get_vectors(convo):
  """Embed each utterance as the mean word2vec vector of its known words.

  Looks words up in the module-level ``model``.

  Args:
    convo: A sized iterable of utterance strings; each utterance is split
      on whitespace into words.

  Returns:
    A float64 array of shape ``(len(convo), vector_size)``. Row ``i`` is the
    average of the vectors of the words of utterance ``i`` that are in the
    model's vocabulary; it is all zeros when none of the words are known
    (including when the utterance is empty).
  """
  # Ask the model for its dimensionality instead of probing a specific word,
  # which would fail if that word were missing from the vocabulary.
  vec_length = model.wv.vector_size
  discourse_vectors = np.zeros((len(convo), vec_length))
  for index, utt in enumerate(convo):
    # `word in model.wv` works on both gensim 3.x and 4.x, whereas the
    # `model.wv.vocab` attribute was removed in gensim 4.
    known = [model.wv[word] for word in str.split(utt) if word in model.wv]
    if known:
      # Accumulate in float64 so the mean matches the precision of the output.
      discourse_vectors[index] = np.sum(known, axis=0, dtype=np.float64) / len(known)
    # Otherwise the row keeps its zero initialisation.
  return discourse_vectors


def reduce_dimensions(all_convos):
  """Project utterance embeddings to 2-D with t-SNE and attach row ids.

  Args:
    all_convos: Utterance strings, passed straight to ``get_vectors``.

  Returns:
    A 2-D array with four columns: t-SNE x, t-SNE y, transcript id and
    utterance order.

  Note:
    The id columns are read from the global ``r.convos`` rather than from
    the argument. NOTE(review): this presumably requires ``all_convos`` to
    be row-aligned with ``r.convos`` -- confirm at the call site.
  """
  utterance_vectors = get_vectors(all_convos)
  # Fixed random_state keeps the layout reproducible between runs.
  embedding = TSNE(n_components=2, random_state=0).fit_transform(utterance_vectors)
  return np.column_stack((
    embedding[:, 0],
    embedding[:, 1],
    r.convos["transcript_id"],
    r.convos["utterance_order"],
  ))


# 2-D coordinates plus ids for every utterance; the R code reads it as py$convo_vecs.
convo_vecs = reduce_dimensions(all_convos=r.convos["gloss"])
# Attach the 2-D coordinates computed in Python to each utterance.
# py$convo_vecs has four unnamed columns: x, y, transcript_id and
# utterance_order. Name them explicitly instead of relying on tibble's
# deprecated automatic V1..V4 naming of unnamed matrices.
convo_coords <- py$convo_vecs
colnames(convo_coords) <- c("V1", "V2", "V3", "V4")

vizconvos <- convos %>%
  left_join(
    as_tibble(convo_coords),
    by = c("transcript_id" = "V3", "utterance_order" = "V4")
  ) %>%
  # Bucket utterances into consecutive groups by utterance_order, in steps
  # of 4; used as the animation frame below.
  mutate(exchange = floor(utterance_order / 4))

# Utterance coordinates coloured by who is speaking.
vizconvos %>%
  ggplot(aes(x = V1, y = V2, color = as_factor(speaker_code))) +
  geom_point()

# Same coordinates coloured by position in the transcript, one panel per
# child age.
vizconvos %>%
  ggplot(aes(x = V1, y = V2, color = utterance_order)) +
  geom_point() +
  facet_wrap(~ target_child_age)

# 3-D scatter: the two embedding axes plus utterance order on the z axis,
# coloured by speaker.
fig <- plot_ly(
  vizconvos,
  x = ~V1,
  y = ~V2,
  z = ~utterance_order,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 3),
  color = ~speaker_code
)

fig

# Animated 2-D scatter with one frame per `exchange` value.
fig <- plot_ly(
  vizconvos,
  x = ~V1, y = ~V2,
  color = ~speaker_code,
  frame = ~exchange,
  hoverinfo = "text",
  type = "scatter",
  mode = "markers"
)

fig